***** Initial Clean Claims Data


clear 
clear matrix
set memory 1000m
set more off
cap log close


cd "/Users/..."

global do_file="‎⁨/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data⁩⁩"
global working_data="/Users/.../working_data"
global results="/Users/.../results"




***********************************
* Step 1: load data from excel file
***********************************




* paste claim_descriptions from the excel file


count if var1=="" & var2=="" & var3==""


drop if var1=="" & var2=="" & var3==""
replace var2=var3 if var2=="" & var3~=""

count if var2~=""

replace var1=var1+var2 if var2~=""


gen temp=substr(var1, 1, 1)

gen first_line=1 if temp==char(34)
drop temp

gen temp_length=length(var1)
gen temp=substr(var1, temp_length, 1)

gen last_line=1 if temp==char(34)

drop temp_length temp

gen var1_new=var1[_n+1] if first_line==1 & last_line==. & first_line[_n+1]==. & last_line[_n+1]==1


gen claim_details=var1
replace claim_details=var1+var1_new if var1_new~=""


count if last_line==1 & first_line==.

list var1 if last_line==1 & first_line==.

drop if last_line==1 & first_line==.

replace claim_details=claim_details+claim_details[_n+1]+claim_details[_n+2]+claim_details[_n+3] if _n==2240
drop if _n==2241 | _n==2242 | _n==2243

gen id=_n

keep id claim_details
order id claim_details

save raw_data/claim_details.dta, replace




* paste repair_details from the excel file

clear

rename var1 repair_details
gen id=_n

order id repair_details

save raw_data/repair_details.dta, replace



* paste other info from the excel file


clear

rename var1 vin
rename var3 labor_fee
rename var4 total_fee
rename var5 claim_date
rename var6 material_fee

gen id=_n


merge 1:1 id using raw_data/claim_details.dta
drop _merge


merge 1:1 id using raw_data/repair_details.dta
drop _merge

drop id

tab var2
drop var2

replace vin=vin[_n-1] if vin==""

order vin claim_date total_fee labor_fee material_fee

save raw_data/claims_raw.dta, replace

erase raw_data/claim_details.dta
erase raw_data/repair_details.dta




***********************************
* Step 2: Clean Claim Descriptions
***********************************


use raw_data/claims_raw.dta, clear


** first: get rid of the quotation marks
gen temp=substr(claim_details, 1, 1)
gen temp1=substr(claim_details, 2, .) if temp==char(34)

replace claim_details=temp1 if temp1~=""

drop temp temp1

gen temp_length=length(claim_details)
gen temp=substr(claim_details, temp_length, 1)
gen temp1=substr(claim_details, 1, temp_length-1) if temp==char(34)

replace claim_details=temp1 if temp1~=""
drop temp_length temp temp1

replace claim_details="" if claim_details=="空"


** split claim_details into two parts
gen temp_pos=strpos(claim_details, "/")

tab temp_pos, m

list claim_details if temp_pos==0
count if claim_details=="" 
// 10 values for claim_details is missing, line 152 does not make sense
replace claim_details="" if _n==152

gen temp1=substr(claim_details, temp_pos-1, 1)
gen temp2=substr(claim_details, temp_pos+1, 1)

tab temp1, m
tab temp2, m
// 11 missing values, all others are " "

list claim_details if temp1~="" & temp1~=" "
list claim_details if temp2~="" & temp2~=" "

drop temp1 temp2

gen first=substr(claim_details, 1, temp_pos-2)
tab first, m

gen second=substr(claim_details, temp_pos+2, .)
drop temp_pos

count if second==""  // 770 missing value for second
count if second==" " // 0 

gen second_orig=second // save a copy of second, do not change this variable

** gen variable for insurance company

gen ins_company="ABIC" if first=="ABIC"
replace ins_company="BOCI" if first=="BOCI"
replace ins_company="BPIC" if first=="BPIC"
replace ins_company="CCIC" if first=="CCIC" | first=="CCIC02"
replace ins_company="CICP" if first=="CICP"
replace ins_company="CLPC" if first=="CLPC02"
replace ins_company="CPIC" if first=="CPIC"
replace ins_company="GPIC" if first=="GPIC"
replace ins_company="HAIC" if first=="HAIC"
replace ins_company="PAIC" if first=="PAIC" | first=="PAIC02"
replace ins_company="PICC" if first=="PICC" | first=="PICC02"
replace ins_company="SPIC" if first=="SPIC"
replace ins_company="TAIC" if first=="TAIC"
replace ins_company="TPBX" if first=="TPBX"
replace ins_company="TPIC" if first=="TPIC"
replace ins_company="UTIC" if first=="UTIC"
replace ins_company="YGBX" if first=="YGBX" | first=="YGBX02"
replace ins_company="ZLIC" if first=="ZLIC"
replace ins_company="ZSIC" if first=="ZSIC"


** gen variable for liability insurance

gen temp_pos=strpos(second, "(交强)")
gen temp_pos2=strpos(second,"交强")

count if temp_pos==0 & temp_pos2>0
list second if temp_pos==0 & temp_pos2>0

gen jiaoq=1 if temp_pos>0
replace jiaoq=0 if jiaoq==. & second_orig~=""

tab jiaoq, m

gen temp_length=length(second)

gen temp=substr(second, 1, temp_pos-1) if temp_pos+7==temp_length & temp_pos~=0

count if temp==""  // 2349 missing values
count if temp==" " // 0

replace second=temp if temp_pos+7==temp_length & temp_pos~=0

count if second==""	   // 1138 
count if second==" "   // 0	

drop temp_pos temp_pos2 temp_length temp


** gen variable for no_reimburse and cancelled claims

gen temp_pos1=strpos(second, "(未理赔)")
gen no_reimburse=1 if temp_pos1>0
replace no_reimburse=0 if no_reimburse==. & second_orig~=""
tab no_reimburse, m

gen temp_pos2=strpos(second, "(注销)")
gen cancelled=1 if temp_pos2>0
replace cancelled=0 if cancelled==. & second_orig~=""
tab cancelled, m

gen temp_length=length(second)
gen temp=substr(second, 1, temp_pos1-1) if temp_pos1+10==temp_length & temp_pos1~=0 & temp_pos2==0
replace second=temp if temp_pos1+10==temp_length & temp_pos1~=0 & temp_pos2==0

count if second==""   // 1357
count if second==" "  // 0	

drop temp_length temp

gen temp_length=length(second)
gen temp=substr(second, 1, temp_pos2-1) if temp_pos2+7==temp_length & temp_pos2~=0 & temp_pos1==0
replace second=temp if temp_pos2+7==temp_length & temp_pos2~=0 & temp_pos1==0

count if second==""
count if second==" "

list second_orig temp if second==" "
replace second="" if second==" "

drop temp_length temp


count if temp_pos1~=0 & temp_pos2~=0
drop temp_pos1 temp_pos2

gen temp_pos1=strpos(second, "未理赔")
gen temp_pos2=strpos(second, "注销")

list second if temp_pos1>0
list second if temp_pos2>0  // has be identified as cancelled claims

drop temp_pos1 temp_pos2

** gen accident locations
gen temp_pos1=strpos(second, "在")
gen temp_pos2=strpos(second, "发生事故")
gen temp_pos3=strpos(second, "发生事故。")

count if temp_pos1~=0 & temp_pos2~=0 & temp_pos2~=temp_pos3
count if temp_pos2-temp_pos3~=0
drop temp_pos3

gen temp_length=length(second)


gen temp=substr(second, temp_pos1+3, .) if temp_pos1~=0 & temp_pos2~=0

gen temp_pos3=strpos(temp, "因")
gen accident_loc=substr(temp, 1, temp_pos3-1) if temp_pos3~=0

count if accident_loc=="" // 2530 missing values
count if accident_loc=="" & temp_pos1>0 // 127

list second if accident_loc=="" & temp_pos1>0

gen temp_pos5=strpos(second, "路") if accident_loc==""
drop temp
gen temp=substr(second, temp_pos1+3, .) if temp_pos1~=0 & temp_pos5~=0
replace accident_loc=substr(temp, 1, temp_pos5-3) if temp_pos5~=0 & accident_loc==""

count if accident_loc=="" & temp_pos1>0
list second if accident_loc=="" & temp_pos1>0


replace accident_loc="湖南省岳阳市华容县东山镇红岗村" if _n==110
replace accident_loc="广东省揭阳市普宁市S省道" if _n==449
replace accident_loc="云南省昆明市呈贡区斗南镇" if _n==553
replace accident_loc="广东省深圳市市辖区龙华新区" if _n==808
replace accident_loc="浙江省宁波市鄞州区东钱湖" if _n==1035
replace accident_loc="浙江省宁波市海曙区宝池新村" if _n==1053
replace accident_loc="广西壮族自治区南宁市青秀区天池山" if _n==1064
replace accident_loc="广西壮族自治区南宁市青秀区天池山小区" if _n==1065
replace accident_loc="广西壮族自治区南宁市宾阳县黎塘镇" if _n==1066
replace accident_loc="河南省平顶山市舞钢市朱兰" if _n==1175
replace accident_loc="辽宁省本溪市明山区客运站中心" if _n==1193
replace accident_loc="北京市市辖区朝阳区五里桥一街" if _n==1829
replace accident_loc="吉林省白山市八道江区银河商城" if _n==2217
replace accident_loc="江苏省南通市如皋市" if _n==2252
replace accident_loc="广元市利州区 映月湖小区" if _n==2417
replace accident_loc="永定区张桑高速" if _n==2452
replace accident_loc="江西省赣州市开发区隐龙山庄" if _n==2575
replace accident_loc="广东省广州市花都区新华街" if _n==2578
replace accident_loc="广东省东莞市市辖区 虎门镇shi gang 社区" if _n==2590
replace accident_loc="广东省东莞市市辖区南城区" if _n==2591
replace accident_loc="浙江省杭州市西湖区榨河巷" if _n==2616 | _n==2617
replace accident_loc="四川省成都市武侯区" if _n==2813 | _n==2814
replace accident_loc="成都市武侯区 天府长城·嘉南地" if _n==2815
replace accident_loc="浙江省温州市永嘉县瓯北镇" if _n==2817
replace accident_loc="浙江省金华市婺城区工贸街号" if _n==2836
replace accident_loc="江苏省徐州市新沂市新春小区中十二巷" if _n==2876
replace accident_loc="浙江省湖州市德清县秋山杭宁高速" if _n==2941


** clean accident type

gen temp1=substr(temp, temp_pos3+3, .)
gen temp_pos4=strpos(temp1, "发生事故")
gen accident_type=substr(temp1, 1, temp_pos4-1) if temp_pos4~=0

replace second="" if temp_pos1~=0 & temp_pos2~=0 & temp_pos2+14==temp_length

count if second==""  // 1901
count if second==" " // 0


drop temp_pos1 temp_pos2 temp_length temp temp1 temp_pos3 temp_pos4 temp_pos5


* adjust accident type


* collision, accident_type=1
gen temp1=strpos(second,"碰撞")
gen temp2=strpos(second,"碰")
gen temp3=strpos(second,"撞")
gen temp4=strpos(second,"剐蹭")
gen temp5=strpos(second,"剐")
gen temp6=strpos(second,"蹭")
gen temp7=strpos(second,"挂")
gen temp8=strpos(second,"擦")
gen temp9=strpos(second,"追尾")


egen temp_total=rowtotal(temp1-temp9)


gen accident_type_new=1 if temp_total>0

drop temp1-temp9 temp_total

tab accident_type, m
tab first, m

replace accident_type_new=1 if accident_type=="碰撞" | first=="碰撞" | first=="碰撞_车对车"


* glass: accident_type=2
gen temp1=strpos(second,"玻璃")
gen temp2=strpos(second,"前挡")
egen temp_total=rowtotal(temp1-temp2)

count if temp_total>0 & accident_type_new==1
list second if temp_total>0 & accident_type_new==1


replace accident_type_new=2 if temp_total>0 & accident_type_new==.
replace accident_type_new=2 if _n==705 | _n==1116 | _n==2417 | _n==2755
drop temp1-temp2 temp_total

tab accident_type, m
tab first, m
replace accident_type_new=2 if accident_type=="玻璃破碎" | first=="玻璃单独损坏" | first=="玻璃单独破碎" | first=="玻璃破碎"

* water: accident_type=3
gen temp1=strpos(second,"水淹")
gen temp2=strpos(second,"水")
gen temp3=strpos(second,"雨")

egen temp_total=rowtotal(temp1-temp3)

count if temp_total>0 & accident_type_new~=.
list second if temp_total>0 & accident_type_new~=.

replace accident_type_new=3 if temp_total>0 & accident_type_new==.
drop temp1-temp3 temp_total

tab accident_type, m
tab first, m

replace accident_type_new=3 if accident_type=="暴雨" | first=="暴雨" | first=="水淹车" | first=="自然灾害" | first=="雹灾"


* paint: accident_type=4
gen temp1=strpos(second,"划痕")
gen temp2=strpos(second,"划")
gen temp3=strpos(second,"油漆")
gen temp4=strpos(second,"漆")

gen temp5=strpos(second, "划分")


egen temp_total=rowtotal(temp1-temp4)

count if temp_total>0 & temp5==0 & accident_type_new~=.
list second accident_type_new if temp_total>0 & temp5==0 & accident_type_new~=.

replace accident_type_new=4 if temp_total>0 & temp5==0 & accident_type_new==.
drop temp1-temp5 temp_total


tab accident_type, m
tab first, m

replace accident_type_new=4 if accident_type=="车身划痕" | first=="划痕" | first=="油漆单独损伤" 

tab accident_type_new, m


* others: accident_type=5
replace accident_type_new=5 if accident_type_new==. & claim_details~=""




** police and medical
gen temp1=strpos(second,"警")

gen contact_police=1 if temp1>0
replace contact_police=0 if contact_police==. & second~=""
drop temp1

tab contact_police, m

gen temp1=strpos(second,"医")
gen temp2=strpos(second, "120")

list second if temp2>0

gen contact_med=1 if (temp1>0 | temp2>0) & _n~=2719 & _n~=2720 
replace contact_med=0 if contact_med==. & second~=""
drop temp1 temp2

tab contact_med, m




** involve other vehicles or people? 


* words related to others
gen temp1=strpos(second,"双车")
gen temp2=strpos(second,"行驶时碰撞")
gen temp3=strpos(second, "驾驶时碰撞")
gen temp4=strpos(second, "与2辆车碰撞")
gen temp5=strpos(second, "行驶碰行驶")
gen temp6=strpos(second, "碰商务车")
gen temp7=strpos(second, "刮到三轮摩托车")
gen temp8=strpos(second, "碰行人")
gen temp9=strpos(second, "碰撞电动车")
gen temp10=strpos(second, "与本田车相刮")
gen temp11=strpos(second, "与车相刮")
gen temp12=strpos(second, "与机动车相向碰撞")
gen temp13=strpos(second, "刮擦别车")
gen temp14=strpos(second, "电动车刮到本车")
gen temp15=strpos(second, "碰到三轮车")
gen temp16=strpos(second, "撞到电动车")
gen temp17=strpos(second, "碰小车")
gen temp18=strpos(second, "撞到对方车")
gen temp19=strpos(second, "剐蹭行人")
gen temp20=strpos(second, "本车刮擦电瓶车")
gen temp21=strpos(second, "与电瓶车刮擦")
gen temp22=strpos(second, "与机动车撞")
gen temp23=strpos(second, "停车时碰墙")
gen temp24=strpos(second, "剐行人")
gen temp25=strpos(second, "相擦")
gen temp26=strpos(second, "碰行人")
gen temp27=strpos(second, "因发生变道")
gen temp28=strpos(second, "碰到电动车")
gen temp29=strpos(second, "与电动车刮擦")
gen temp30=strpos(second, "擦电瓶车")
gen temp31=strpos(second, "开门时撞电动车")
gen temp32=strpos(second, "与一辆摩托车相剐")
gen temp33=strpos(second, "撞到行人") 
gen temp34=strpos(second, "行驶时碰电动车") 
gen temp35=strpos(second, "因发生倒车") 
gen temp36=strpos(second, "撞自行车") 
gen temp37=strpos(second, "与三者货车擦") 
gen temp38=strpos(second, "电瓶车又碰到本车") 
gen temp39=strpos(second, "与电动三轮车相擦") 
gen temp40=strpos(second, "与摩托车擦") 
gen temp41=strpos(second, "与电动车碰撞") 
gen temp42=strpos(second, "和自行车相蹭") 
gen temp43=strpos(second, "行驶撞车") 
gen temp44=strpos(second, "倒车碰对方车辆") 
gen temp45=strpos(second, "倒车撞一辆停放的宝马") 
gen temp46=strpos(second, "撞别克") 
gen temp47=strpos(second, "本车碰别克") 
gen temp48=strpos(second, "碰撞到骑电瓶车的人") 
gen temp49=strpos(second, "并线时碰") 
gen temp50=strpos(second, "蹭门") 
gen temp51=strpos(second, "停放中被撞") 
gen temp52=strpos(second, "倒车刮到他车") 
gen temp53=strpos(second, "标的车左转与1辆车碰撞") 
gen temp54=strpos(second, "两车")
gen temp55=strpos(second, "三车")
gen temp56=strpos(second, "四车")
gen temp57=strpos(second, "多方")
gen temp58=strpos(second, "追尾")
gen temp59=strpos(second, "撞行人")
gen temp60=strpos(second, "三者车")



* words not related to others
gen temp61=strpos(second, "单方车损")
gen temp62=strpos(second, "行驶时碰撞电线杆")
gen temp63=strpos(second, "行驶时碰撞到石墩") 
gen temp64=strpos(second, "行驶时碰撞石墩") 
gen temp65=strpos(second, "行驶时碰撞石台") 
gen temp66=strpos(second, "行驶时碰撞护栏") 
gen temp67=strpos(second, "行驶时碰撞石头") 
gen temp68=strpos(second, "行驶时碰撞摄像头柱子") 
gen temp69=strpos(second, "行驶时碰撞铁架子") 




egen temp_total1=rowtotal(temp1-temp59)
egen temp_total2=rowtotal(temp61-temp69)

count if temp_total1==0 & temp60>0
list second if temp_total1==0 & temp60>0

gen involve_others=1 if temp_total1>0
replace involve_other=1 if temp_total1==0 & temp60>0 & accident_type_new==1
replace involve_other=0 if temp_total2>0
replace involve_other=0 if involve_other==. & second~="" 


tab involve_others, m
// "." represent for missing value

drop temp1-temp69 temp_total1 temp_total2


** responsibility

gen temp1=strpos(second, "本车全责")
gen temp2=strpos(second, "全责") 
gen temp3=strpos(second, "责任划分为全责") 
gen temp4=strpos(second, "本车主责") 

gen temp5=strpos(second, "被追尾")
gen temp6=strpos(second, "逃逸")
gen temp7=strpos(second, "责任划分为无责") 


egen temp_total1=rowtotal(temp1-temp4)
egen temp_total2=rowtotal(temp5-temp7)


gen at_fault=1 if temp_total1>0 & involve_others==1
replace at_fault=0 if temp_total2>0 & involve_others==1
replace at_fault=1 if involve_others==0
replace at_fault=-1 if at_fault==. & involve_others==1 

drop temp1-temp7 temp_total1 temp_total2

tab at_fault, m
// -1 represent for "not sure"
// "." represent for missing value


** remove space at the end of the string

gen temp_length=length(second)
gen temp1=substr(second, 1, 1) if temp_length>0

tab temp1 if temp_length>0, m

gen temp2=substr(second, temp_length, 1) if temp_length>0

count if temp2==" " // 215
count if temp2==""	// 1901

gen second_new=substr(second, 1, temp_length-1) if temp2==" " 
replace second=second_new if temp2==" "
drop second_new temp1 temp2 temp_length


** human injury

gen temp1=strpos(second, "1人伤")
gen temp2=strpos(second, "一人伤")
gen temp3=strpos(second, "三者人伤:1")
gen temp4=strpos(second, "伤者有伤")
gen temp5=strpos(second, "2人伤")
gen temp6=strpos(second, "行人受伤")
gen temp7=strpos(second, "一位人伤")
gen temp8=strpos(second, "两人伤")
gen temp9=strpos(second, "1有伤")
gen temp10=strpos(second, "本车有人伤")
gen temp11=strpos(second, "三者行人受伤")
gen temp12=strpos(second, "三责1男受伤")

gen temp13=strpos(second, "无人伤")
gen temp14=strpos(second, "无物损人伤")
gen temp15=strpos(second, "碰三者人伤：无")
gen temp16=strpos(second, "未涉及人伤")
gen temp17=strpos(second, "没有人伤")
gen temp18=strpos(second, "无其他人伤")
gen temp19=strpos(second, "无物损和人伤")


egen temp_total1=rowtotal(temp1-temp12)
egen temp_total2=rowtotal(temp13-temp19)


gen human_injury=1 if temp_total1>0
replace human_injury=0 if temp_total2>0
replace human_injury=-1 if human_injury==. & second~=""

drop temp1-temp19 temp_total1 temp_total2

tab human_injury, m
// -1 represent for "not sure"
// "." represent for missing value



** other damage

gen temp1=strpos(second, "物损")
gen temp2=strpos(second, "其他损失")
gen temp3=strpos(second, "他损")
gen temp4=strpos(second, "损失")
gen temp5=strpos(second, "有损")


gen temp6=strpos(second, "碰三者物损：墙无损")
gen temp7=strpos(second, "无物损")
gen temp8=strpos(second, "本车物损：无")
gen temp9=strpos(second, "无物损人伤")
gen temp10=strpos(second, "无人伤物损")
gen temp11=strpos(second, "无物损")
gen temp12=strpos(second, "碰三者物损：墩子无损")
gen temp13=strpos(second, "碰三者物损：台阶无损")
gen temp14=strpos(second, "未涉及人伤及物损")
gen temp15=strpos(second, "碰三者物损：柱子无损")
gen temp16=strpos(second, "碰三者物损：无")
gen temp17=strpos(second, "碰三者物损：树无损")
gen temp18=strpos(second, "无其他物损")
gen temp19=strpos(second, "碰三者物损：石墩无损")
gen temp20=strpos(second, "电瓶车无损")
gen temp21=strpos(second, "无人伤、物损")
gen temp22=strpos(second, "无人伤和物损")
gen temp23=strpos(second, "无三者物损")
gen temp24=strpos(second, "碰三者物损：花坛无损")
gen temp25=strpos(second, "没有人伤、物损")
gen temp26=strpos(second, "无其他人伤及物损")
gen temp27=strpos(second, "无人伤及其他物损")
gen temp28=strpos(second, "无其他损失")
gen temp29=strpos(second, "无其他损")
gen temp30=strpos(second, "无人伤及其他损失")
gen temp31=strpos(second, "无他损")
gen temp32=strpos(second, "无其他损坏")
gen temp33=strpos(second, "护栏没有损失")


gen temp34=strpos(second, "物损:电动车")
gen temp35=strpos(second, "物损,碰倒闸需赔")
gen temp36=strpos(second, "碰三者物损：电瓶车有损")
gen temp37=strpos(second, "碰三者物损：三轮车有损")
gen temp38=strpos(second, "碰三者物损：垃圾桶有损")

egen temp_total1=rowtotal(temp6-temp33)
egen temp_total2=rowtotal(temp34-temp38)

gen other_damage=1 if temp_total2>0
replace other_damage=0 if temp_total1>0
replace other_damage=-1 if other_damage==. & second~=""

drop temp1-temp38 temp_total1 temp_total2


tab other_damage, m


** manually generate own damage and opp damage
gen id=_n
preserve

keep id vin second involve_others
order id vin second
bysort vin second: keep if _n==1
keep if second~=""

export delimited using "/Users/.../working_data/claim_details_070219.csv", replace

restore


save working_data/claim_temp_070119.dta, replace


** merge with own damage and opp damage

use working_data/claim_temp_070119.dta, clear

merge 1:1 id using working_data/claim_details_070219.dta
drop _merge

bysort vin second: egen own_damage_new=total(own_damage)
replace own_damage_new=. if second==""

bysort vin second: egen opp_damage_new=total(opp_damage)
replace opp_damage_new=. if second=="" 


drop own_damage opp_damage
rename own_damage_new own_damage
rename opp_damage_new opp_damage


count if repair_details=="" 

gen repair_details_new=subinstr(repair_details, "，", "", .)
gen temp_length1=length(repair_details)
gen temp_length2=length(repair_details_new)

gen num_parts_repaired=(temp_length1-temp_length2)/3+1 if repair_details~="" 

drop temp_length1 temp_length2 
drop repair_details_new

sort id
rename id id_orig



save working_data/claim_clean_070219.dta, replace































